In [ ]:
import pandas as pd
import numpy as np
import numerapi
import os
import plotly.express as px
import plotly.graph_objects as go
import catboost
import optuna
In [ ]:
# Load the Numerai training data and collect the feature columns by name prefix.
training_set = pd.read_parquet("data/numerai_training_data.parquet")
feature_names = [col for col in training_set.columns if "feature_" in col]
In [ ]:
# All unique era identifiers, in the order they first appear in the training data.
eras = training_set.era.unique()

# Number of cross-validation folds.
NUM_FOLDS = 5
# Eras per validation fold (integer division; remainder eras stay in training).
FOLD_SIZE = int(len(eras) / NUM_FOLDS)
# Eras excluded on each side of a validation fold so overlapping targets cannot
# leak between training and validation.
# NOTE(review): name is a typo for EMBARGO_SIZE — kept as-is because later
# cells reference this spelling.
EMBAGO_SIZE = 64
In [ ]:
# Build an era-indexed table that labels every era as training / validation /
# embargo for each cross-validation fold.
splits_df = pd.DataFrame({
    'era': eras,
}).set_index("era")

window = FOLD_SIZE + 2 * EMBAGO_SIZE  # validation fold plus both embargo margins
step_size = (len(eras) - window) // (NUM_FOLDS - 1)

for fold in range(NUM_FOLDS):
    start = fold * step_size
    end = start + window
    col = f"split_{fold}"

    # Eras inside the window but outside the embargo margins form the
    # validation fold; everything outside the window is training.
    validation_eras = eras[start + EMBAGO_SIZE : end - EMBAGO_SIZE]
    training_eras = np.concatenate([eras[:start], eras[end:]])

    splits_df[col] = "embargo"  # default: the whole embargoed window
    splits_df.loc[validation_eras, col] = "validation"
    splits_df.loc[training_eras, col] = "training"

# show the resulting assignment table
splits_df
Out[ ]:
split_0 split_1 split_2 split_3 split_4
era
0001 embargo training training training training
0002 embargo training training training training
0003 embargo training training training training
0004 embargo training training training training
0005 embargo training training training training
... ... ... ... ... ...
0570 training training training training embargo
0571 training training training training embargo
0572 training training training training embargo
0573 training training training training embargo
0574 training training training training embargo

574 rows × 5 columns

In [ ]:
# Visualize the fold assignments as a plotly table: one row per era, one
# column per split, with embargoed cells highlighted in red.
#
# Fixes vs. the previous version:
#  - `"embargo" in series` tests *index* membership in pandas, so the old
#    fill_color condition was always False (all cells white); colors are now
#    computed per cell from the column values.
#  - plotly expects one color list per table column; the old list was one
#    entry short (no entry for the Era column).
#  - the <b> header tags were never closed.
#  - removed the unused `n_colors` import.

splits = [s for s in splits_df.columns.tolist() if "split" in s]

# One color list per column: the era column stays white; in each split column,
# embargo cells are red and everything else is white.
fill_colors = [["#FFFFFF"] * len(eras)] + [
    ["#FF0000" if value == "embargo" else "#FFFFFF" for value in splits_df[s]]
    for s in splits
]

fig = go.Figure(data=[go.Table(
    header=dict(values=["<b>Era</b>"] + ['<b>Split {}</b>'.format(i) for i in range(NUM_FOLDS)]),
    cells=dict(
        values=[eras] + [splits_df['split_{}'.format(i)] for i in range(NUM_FOLDS)],
        fill_color=fill_colors,
        align='center',
    ),
)])

fig.show(renderer='notebook')
In [ ]:
from scipy.stats import gmean

# hyper parameter tuning with optuna
def objective(trial):
    """Score one hyper-parameter candidate by embargoed cross-validation.

    Trains a CatBoost model per fold on the training eras, predicts the
    held-out validation eras, and averages the per-era prediction/target
    correlations. Returns the geometric mean of the per-fold means.

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the hyper-parameter suggestions for this iteration.
    """
    # suggest hyper parameters to try at each iteration
    # (suggest_float(..., log=True) replaces the deprecated suggest_loguniform)
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "task_type": "GPU",
    }

    all_correlations = []

    # loop over each cross validation fold
    for split in splits:
        train_eras = splits_df.loc[splits_df[split] == "training"].index
        validation_eras = splits_df.loc[splits_df[split] == "validation"].index

        # subset the data once per fold instead of recomputing the era masks
        # for every column access
        train_rows = training_set.loc[training_set.era.isin(train_eras)]
        validation_rows = training_set.loc[training_set.era.isin(validation_eras)]

        # create model with same parameters each fold
        model = catboost.CatBoostRegressor(**params)
        model.fit(
            X=train_rows[feature_names],
            y=train_rows["target"],
            verbose=False,
        )

        # make predictions on validation fold
        preds_df = pd.DataFrame(
            {
                "prediction": model.predict(
                    validation_rows[feature_names],
                    verbose=False,
                ),
                "era": validation_rows["era"],
                "target": validation_rows["target"],
            },
            index=validation_rows.index,
        )

        # calculate correlation between prediction and target grouped by era
        era_correlations = preds_df.groupby("era").apply(
            lambda era: np.corrcoef(era["prediction"], era["target"])[0, 1]
        )

        # mean across all eras of this fold
        mean_correlation = era_correlations.mean()

        # append to list of all correlations
        all_correlations.append(mean_correlation)

    # some splits tend to have higher correlation than others;
    # geometric mean prevents these splits from skewing the results.
    # NOTE: gmean yields NaN if any fold's mean correlation is negative —
    # optuna then records the trial as failed, which is acceptable since
    # such candidates should rank poorly anyway.
    geometric_mean_correlation = gmean(all_correlations)

    return geometric_mean_correlation

# hyper parameter optimization with random search over the space defined in
# `objective`, maximizing the cross-validated correlation score
study = optuna.create_study(
    direction="maximize",
    study_name="catboost_hyper_parameter_tuning",
    # fixed seed so the random search is reproducible
    sampler=optuna.samplers.RandomSampler(seed=42),
    )

# 30 trials
study.optimize(objective, n_trials=30)
[I 2022-05-02 20:39:18,361] A new study created in memory with name: catboost_hyper_parameter_tuning
[I 2022-05-02 20:48:08,244] Trial 0 finished with value: 0.015598954058755022 and parameters: {'iterations': 812, 'learning_rate': 0.6351221010640696, 'depth': 9}. Best is trial 0 with value: 0.015598954058755022.
[I 2022-05-02 20:53:24,378] Trial 1 finished with value: 0.03955272838067788 and parameters: {'iterations': 1238, 'learning_rate': 0.00042079886696066364, 'depth': 5}. Best is trial 1 with value: 0.03955272838067788.
[I 2022-05-02 20:55:46,873] Trial 2 finished with value: 0.035781989104643266 and parameters: {'iterations': 210, 'learning_rate': 0.29154431891537513, 'depth': 8}. Best is trial 1 with value: 0.03955272838067788.
[I 2022-05-02 21:13:46,587] Trial 3 finished with value: 0.04383621612161705 and parameters: {'iterations': 1446, 'learning_rate': 0.00012087541473056971, 'depth': 10}. Best is trial 3 with value: 0.04383621612161705.
[I 2022-05-02 21:20:29,875] Trial 4 finished with value: 0.04329294203991884 and parameters: {'iterations': 1682, 'learning_rate': 0.0007068974950624604, 'depth': 5}. Best is trial 3 with value: 0.04383621612161705.
[I 2022-05-02 21:24:07,137] Trial 5 finished with value: 0.044841446893135364 and parameters: {'iterations': 448, 'learning_rate': 0.0016480446427978971, 'depth': 7}. Best is trial 5 with value: 0.044841446893135364.
[I 2022-05-02 21:32:20,293] Trial 6 finished with value: 0.0488792949514575 and parameters: {'iterations': 921, 'learning_rate': 0.0014618962793704966, 'depth': 8}. Best is trial 6 with value: 0.0488792949514575.
[I 2022-05-02 21:34:46,175] Trial 7 finished with value: 0.041903476970997056 and parameters: {'iterations': 365, 'learning_rate': 0.0014742753159914669, 'depth': 6}. Best is trial 6 with value: 0.0488792949514575.
[I 2022-05-02 21:38:47,358] Trial 8 finished with value: 0.05022553670821227 and parameters: {'iterations': 966, 'learning_rate': 0.13826232179369857, 'depth': 5}. Best is trial 8 with value: 0.05022553670821227.
[I 2022-05-02 21:42:22,492] Trial 9 finished with value: 0.05568186814182655 and parameters: {'iterations': 1077, 'learning_rate': 0.0234238498471129, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 21:46:24,147] Trial 10 finished with value: 0.03764316492949973 and parameters: {'iterations': 1254, 'learning_rate': 0.00048094619675015767, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:05:09,777] Trial 11 finished with value: 0.016769418371694763 and parameters: {'iterations': 1903, 'learning_rate': 0.7286653737491037, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:11:06,046] Trial 12 finished with value: 0.0417021103285943 and parameters: {'iterations': 679, 'learning_rate': 0.00024586032763280086, 'depth': 8}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:17:08,117] Trial 13 finished with value: 0.041413498008434316 and parameters: {'iterations': 936, 'learning_rate': 0.00030771802712506853, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:18:25,014] Trial 14 finished with value: 0.044528835955442506 and parameters: {'iterations': 165, 'learning_rate': 0.43379206974909373, 'depth': 5}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:26:38,172] Trial 15 finished with value: 0.05085371345839203 and parameters: {'iterations': 1359, 'learning_rate': 0.0017654048052495078, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 22:40:25,813] Trial 16 finished with value: 0.04777174383159533 and parameters: {'iterations': 1139, 'learning_rate': 0.0005488047000766049, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:01:31,490] Trial 17 finished with value: 0.018877487093125016 and parameters: {'iterations': 1573, 'learning_rate': 0.5727904470799616, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:06:07,496] Trial 18 finished with value: 0.033435831845056926 and parameters: {'iterations': 1236, 'learning_rate': 0.48696409415208936, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:09:34,691] Trial 19 finished with value: 0.03742721363803709 and parameters: {'iterations': 472, 'learning_rate': 0.00015167330688076205, 'depth': 6}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:20:10,300] Trial 20 finished with value: 0.048779674492417 and parameters: {'iterations': 838, 'learning_rate': 0.0012172958098369967, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:26:14,786] Trial 21 finished with value: 0.04639859105643161 and parameters: {'iterations': 778, 'learning_rate': 0.0013296521457299515, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:28:00,399] Trial 22 finished with value: 0.053253665181782046 and parameters: {'iterations': 367, 'learning_rate': 0.16172900811143134, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:35:51,391] Trial 23 finished with value: 0.047606668662971446 and parameters: {'iterations': 1976, 'learning_rate': 0.12273800987852965, 'depth': 5}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:37:23,990] Trial 24 finished with value: 0.04465505855161698 and parameters: {'iterations': 110, 'learning_rate': 0.18274508859816008, 'depth': 8}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:42:07,021] Trial 25 finished with value: 0.0510479129275434 and parameters: {'iterations': 1485, 'learning_rate': 0.12164139351417062, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:52:31,984] Trial 26 finished with value: 0.04420323540665883 and parameters: {'iterations': 781, 'learning_rate': 0.00029072088906598463, 'depth': 10}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-02 23:56:55,996] Trial 27 finished with value: 0.04613187748558095 and parameters: {'iterations': 1284, 'learning_rate': 0.00210664860170422, 'depth': 4}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-03 00:04:20,395] Trial 28 finished with value: 0.05017677899308446 and parameters: {'iterations': 691, 'learning_rate': 0.0019986340778528873, 'depth': 9}. Best is trial 9 with value: 0.05568186814182655.
[I 2022-05-03 00:12:59,248] Trial 29 finished with value: 0.028405286388197913 and parameters: {'iterations': 1311, 'learning_rate': 0.35387588647792356, 'depth': 7}. Best is trial 9 with value: 0.05568186814182655.
In [ ]:
# report the best hyper-parameters found and the score they achieved
for result in (study.best_params, study.best_value):
    print(result)
{'iterations': 1077, 'learning_rate': 0.0234238498471129, 'depth': 4}
0.05568186814182655
In [ ]:
# parallel-coordinate view of every trial across the searched hyper-parameters
parallel_fig = optuna.visualization.plot_parallel_coordinate(study)
parallel_fig.show(renderer="notebook")
In [ ]:
# relative importance of each hyper-parameter to the objective value
importance_fig = optuna.visualization.plot_param_importances(study)
importance_fig.show(renderer='notebook')